# code to run/conduct analyses on structural topic model

# note: the data necessary to run this code contains sensitive information,
# and is therefore omitted
library(here)

here::here()

source("code/startup.R")
library(tidytext)
library(lubridate)
library(ldatuning)
library(stm)

rfs.matched <- read.csv( "data omitted for privacy")

set.seed(555)
rfs.matched$gen.a <- sapply(rfs.matched$gen.comb.imp, function(x){
  rbinom(1, 1, x)
})

rfs.matched$race.a <- sapply(rfs.matched$race.comb, function(x){
  rbinom(1, 1, x)
})

set.seed(12345)

rfs.matched$college.cat <- relevel(rfs.matched$college.cat, ref = "nonmatched")
rfs.matched$inc.cat <- relevel(rfs.matched$inc.cat, ref = "nonmatched")
rfs.matched$USR <- relevel(rfs.matched$USR, ref = "unk")

# preprocess the documents (this code uses STM's defaults, with other preprocessing steps handled in previous script)
processed <- stm::textProcessor(rfs.matched$whyrun, metadata = rfs.matched)
out <- stm::prepDocuments(processed$documents, processed$vocab, processed$meta)
docs <- out$documents
vocab <- out$vocab
meta <-out$meta

# identify ideal number of topics
# cast wide net
rfs_sk.init <- searchK(out$documents, out$vocab, 
                       K = c(10, 20, 30, 40, 50),
                       prevalence =~ gen.comb.imp + race.comb + college.cat + inc.cat +
                         USR + std.age + log_why_wordcount + running, 
                       data = meta, seed = 12345)

# check output
rfs_sk.init$results

# write to table
write.csv(rfs_sk.init$results, file = "output omitted")

# narrow down (kink in residuals is between 10 and 20)
rfs_sk <- searchK(out$documents, out$vocab, 
                  K = c(11:19),
                  prevalence =~ gen.comb.imp + race.comb + college.cat + inc.cat +
                    USR + std.age + log_why_wordcount + running, 
                  data = meta, seed = 12345)

#check output again
rfs_sk$results

# write to table
write.csv(rfs_sk$results, file = "output omitted")


# fit model (17 topics ideal based on above selection method)
rfs_fit <- stm::stm(documents = out$documents, vocab = out$vocab,
                    data = out$meta,
                    K = 17, init.type = "Spectral",
                    prevalence =~ gen.comb.imp + race.comb + college.cat + inc.cat +
                      USR + std.age + log_why_wordcount + running)

rfs_est <- stm::estimateEffect(1:17 ~ gen.comb.imp + race.comb + college.cat + inc.cat +
                                 USR + std.age + log_why_wordcount + running,
                               stmobj = rfs_fit,
                               metadata = out$meta, uncertainty = "Global")
summary(rfs_est, topics = 5) # check results
summary(rfs_est, topics = 11) # check results

rfs_fitsum <- summary(rfs_est, topics = 1:17) # full results

# fits table for the appendix
rfs_fitstable <- bind_cols(data.frame(vars = c("(Intercept)","Male","White",
                                               "College","Non-College", "Matched-Unobserved College",
                                               "Income 100-250k","Income 50-100k","Matched-Unobserved Income",
                                               "Income >250k", "Income <50k",
                                               "Rural","Suburban","Urban",
                                               "Age (sd)","log(Word Count)","Running")),
                           bind_cols(lapply(rfs_fitsum$tables, function(x){
                             dat <- data.frame(round(x[,c(1,2)], 3))
                             dat[,1] <- sapply(1:length(dat[,1]), function(y){
                               ifelse(x[y,4] < .05, paste0(dat[y,1], "*"), dat[y,1])
                             })
                             dat[,2] <- sapply(dat[,2], function(y){
                               paste0("(",y,")")
                             })
                             return(dat)
                           })
                           ))
names(rfs_fitstable) <- c("variable",paste0("Topic 1: ", topiclabs[1]),"", paste0("Topic 2: ", topiclabs[2]),"",
                          paste0("Topic 3: ", topiclabs[3]),"", paste0("Topic 4: ", topiclabs[4]),"", 
                          paste0("Topic 5: ", topiclabs[5]),"", paste0("Topic 6: ", topiclabs[6]),"", 
                          paste0("Topic 7: ", topiclabs[7]),"",paste0("Topic 8: ", topiclabs[8]), "",
                          paste0("Topic 9: ", topiclabs[9]),"", paste0("Topic 10: ", topiclabs[10]),"", 
                          paste0("Topic 11: ", topiclabs[11]),"", paste0("Topic 12: ", topiclabs[12]),"",
                          paste0("Topic 13: ", topiclabs[13]),"",paste0("Topic 14: ", topiclabs[14]),"",
                          paste0("Topic 15: ", topiclabs[15]),"",paste0("Topic 16: ", topiclabs[16]),"",
                          paste0("Topic 17: ", topiclabs[17]),"")
xtable::xtable(t(rfs_fitstable))


plot(rfs_fit, main = "Run for Something Topics by Proportion") # plot topic proportions

labs <- labelTopics(rfs_fit) # squint at topic words

p.corr <- stm::topicCorr(rfs_fit, method = "simple")
plot(p.corr) # squint at topic correlations

# store top 5 representative cases for each topic 
thoughts <- list(NULL)
for(i in 1:17){
  thoughts[[i]] <- stm::findThoughts(rfs_fit, texts = out$meta$whyrun, n = 5, topics = i)
}

# inspect
thoughts[[3]]

thoughtsframe <- data.frame(topic = rep(1:17, each = 5),
                            thought = rep(1:5, 17))
# add top 5 words and docs
thoughtsframe$prob_words <- as.vector(sapply(1:nrow(labs$prob), function(x){
  labs$prob[x,1:5]
}))

thoughtsframe$frex_words <- as.vector(sapply(1:nrow(labs$frex), function(x){
  labs$frex[x,1:5]
}))

thoughtsframe$lift_words <- as.vector(sapply(1:nrow(labs$lift), function(x){
  labs$lift[x,1:5]
}))

thoughtsframe$score_words <- as.vector(sapply(1:nrow(labs$score), function(x){
  labs$score[x,1:5]
}))

thoughtsframe$doc <- unlist(sapply(1:17, function(x){
  thoughts[[x]]$docs[1]
}))

# write to file
write.csv(thoughtsframe, file = "output omitted for privacy")

topiclabs <- c("Educators","Progressives","Education","Make a Difference","Representation",
               "Political Dynamics", "Trump","Previous Interest","Male Identity","Better Future","Preliminary",
               "Change","National Issues","Local Dynamics","Populism","Health","Community")

# version without demographic metadata (only logged wordcount and emergence as prevalence priors)
# identify ideal number of topics
# cast wide net
rfs_sk.nodemo.init <- searchK(out$documents, out$vocab, 
                       K = c(10, 20, 30, 40, 50),
                       prevalence =~ log_why_wordcount + running, 
                       data = meta, seed = 12345)

# check output
rfs_sk.nodemo.init$results

# write to table
write.csv(rfs_sk.nodemo.init$results, file = "output omitted")

# narrow down (kink in residuals is between 10 and 20)
rfs_sk.nodemo <- searchK(out$documents, out$vocab, 
                  K = c(11:19),
                  prevalence =~ log_why_wordcount + running, 
                  data = meta, seed = 12345)

#check output again
rfs_sk.nodemo$results

# write to table
write.csv(rfs_sk.nodemo$results, file = "output omitted")

# visualize some more
source("code/stm_helpers.R")
set.seed(3333)
gendiff.p <- plotSTMdiff(rfs_est, covariate = "gen.comb.imp", topics = c(1:17),
                         model = rfs_fit, method = "difference",
                         cov.value1 = 1, cov.value2 = 0)

racdiff.p <- plotSTMdiff(rfs_est, covariate = "race.comb", topics = c(1:17),
                         model = rfs_fit, method = "difference",
                         cov.value1 = 1, cov.value2 = 0)

urdiff.p <- plotSTMdiff(rfs_est, covariate = "USR", topics = c(1:17),
                        model = rfs_fit, method = "difference",
                        cov.value1 = "U", cov.value2 = "R")
sudiff.p <- plotSTMdiff(rfs_est, covariate = "USR", topics = c(1:17),
                        model = rfs_fit, method = "difference",
                        cov.value1 = "S", cov.value2 = "U")

srdiff.p <- plotSTMdiff(rfs_est, covariate = "USR", topics = c(1:17),
                        model = rfs_fit, method = "difference",
                        cov.value1 = "S", cov.value2 = "R")

colldiff.p <- plotSTMdiff(rfs_est, covariate = "college.cat", topics = c(1:17),
                          model = rfs_fit, method = "difference",
                          cov.value1 = "mc", cov.value2 = "mnc")

rundiff.p <- plotSTMdiff(rfs_est, covariate = "running", topics = c(1:17),
                         model = rfs_fit, method = "difference",
                         cov.value1 = 1, cov.value2 = 0)

incdiff.p <- plotSTMdiff(rfs_est, covariate = "inc.cat", topics = c(1:17),
                         model = rfs_fit, method = "difference",
                         cov.value1 = "over250", cov.value2 = "under50")

gendiff <- gendiff.p +
  ylim(-.1, .1)+
  labs(title = "Difference by Gender",
       y = "<<< More Female      More Male >>>")+
  scale_x_discrete(name = "Topic",
                   # update labels later
                   labels = topiclabs)

racdiff <- racdiff.p +
  ylim(-.1, .1)+
  labs(title = "Difference by Race",
       y = "<<< More Non-White      More White >>>")+
  scale_x_discrete(name = "",
                   labels = rep("", 17))

urdiff <- urdiff.p +
  ylim(-.1, .1)+
  labs(title = "Difference by Geography",
       y = "<<< More Rural      More Urban >>>")+
  scale_x_discrete(name = "Topic",
                   labels = topiclabs)

colldiff <- colldiff.p +
  ylim(-.1, .1)+
  labs(title = "Difference by Education",
       y = "<<< More Non-College      More College >>>")+
  scale_x_discrete(name = "Topic",
                   labels = topiclabs)

sudiff <- sudiff.p +
  ylim(-.1, .1)+
  labs(title = "Expected Proportion Difference by Geographic Context",
       x = "Topic Number",
       y = "<<< More Urban      More Suburban >>>")
srdiff <- srdiff.p +
  ylim(-.1, .1)+
  labs(title = "Expected Proportion Difference by Geographic Context",
       x = "Topic Number",
       y = "<<< More Rural      More Suburban >>>")

rundiff <- rundiff.p +
  ylim(-.1, .1)+
  labs(title = "Difference by Emergence",
       y = "<<< More Not Running      More Running >>>")+
  scale_x_discrete(name = "",
                   labels = rep("", 17))


ggsave(gendiff, file = "figures/gendiff_matched.png", width = 8, height = 7)
ggsave(racdiff, file = "figures/racdiff_matched.png", width = 8, height = 7)
ggsave(urdiff, file = "figures/urdiff_matched.png", width = 8, height = 7)
ggsave(colldiff, file = "figures/colldiff_matched.png", width = 8, height = 7)
ggsave(rundiff, file = "figures/rundiff_matched.png", width = 8, height = 7)

topicdiffs <- gridExtra::grid.arrange(gendiff, racdiff, colldiff, rundiff,
                                      nrow = 2, ncol = 2, widths = c(1.275, 1))
ggsave(topicdiffs, file = "figures/topicdiffs_matched.png", width = 12, height = 8)

# fit documents back based on model
fits <- make.dt(rfs_fit, meta = out$meta)

fits$max.topic <- sapply(1:nrow(fits), function(x){
  which.max(fits[x,2:18])
})

fits %>% group_by(max.topic) %>% 
  summarise(count = n()) %>%
  ggplot(aes(x = factor(max.topic), y = count))+
  geom_bar(stat = "identity")+
  scale_x_discrete(name = "Count of Docs With Highest Proportion of Words in Topic...", 
                   breaks = c(1:17), labels = topiclabs)

fits_sub <- fits[,c(2:18,26, 34, 35, 39, 42, 45, 47, 48)]

library(ggbeeswarm)

topicdist <- fits_sub %>% dplyr::select(1:18) %>%
  group_by(running) %>%
  reshape2::melt(id.vars = c("running")) %>%
  ggplot(aes(x = variable, y = value))+
  facet_wrap(~running, nrow = 2, ncol = 1, labeller = as_labeller(c("0" = "Non-Candidates", "1" = "Candidates")))+
  geom_quasirandom()+
  labs(y = "Proportion", title = "Distributions of Topic Proportions by Candidate Emergence")+
  scale_x_discrete(name = "Topic",
                   labels = topiclabs)+
  theme_bw()+
  theme(text=element_text(family="Times New Roman", size=14),
    plot.title = element_text(face = "bold", size = 24),
    axis.text.x=element_text(angle=45,hjust=1, size = 16),
    axis.title.x = element_text(size = 20),
    strip.text = element_text(size = 20)
  )
ggsave(topicdist, file = "figures/topicdist_matched.png", width = 14, height = 10)

proptable <- fits_sub %>% dplyr::select(1:18) %>%
  group_by(running) %>%
  reshape2::melt(id.vars = c("running")) %>%
  group_by(variable) %>%
  summarise(meanprop = mean(value),
            medprop = median(value),
            meanprop_run = mean(value[running == 1]),
            medprop_run = median(value[running == 1])) %>%
  arrange(desc(medprop))
xtable::xtable(proptable)

save.image("output omitted")

# again but test with running as only prior
rfs_sk.init.norun <- searchK(out$documents, out$vocab, 
                       K = c(10, 20, 30, 40),
                       prevalence =~ running, 
                       data = meta, seed = 12345)

rfs_sk_norun <- searchK(out$documents, out$vocab, 
                             K = c(10:20),
                             prevalence =~ running, 
                             data = meta, seed = 12345)

# fit model (17 topics ideal based on above selection method)
rfs_fit_norun <- stm::stm(documents = out$documents, vocab = out$vocab,
                    data = out$meta,
                    K = 17, init.type = "Spectral",
                    prevalence =~ running, seed = 12345)

rfs_est_norun <- stm::estimateEffect(1:17 ~ running + gen.comb.imp,
                               stmobj = rfs_fit_norun,
                               metadata = out$meta, uncertainty = "Global")

rfs_fitsum_norun <- summary(rfs_est_norun, topics = 1:17) # full results


# fit documents back based on model
fits_norun <- make.dt(rfs_fit_norun, meta = out$meta)

fits_sub_norun <- fits_norun[,c(2:18,26, 35, 39, 42, 45, 47, 48)]

write.csv(fits_sub_norun, file = "output omitted")